Dataset: adult-modified.csv — a modified version of the UCI Adult census income dataset.
# Import Modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
from sklearn import preprocessing
from sklearn import neighbors, tree, naive_bayes
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import cross_validation
# Change working directory
# NOTE(review): hard-coded absolute path — this only works on the author's machine.
os.chdir('/Users/lavinia/Google Drive/18winter-textbooks/CSC478/Assignment2')
# Load file via read_csv
# NOTE: the file on disk is "adult-modified.csv" (hyphen); adjust if the CSV is renamed.
adult_df = pd.read_csv("adult-modified.csv")
# Preview the first 5 rows (the notebook displays the last expression of a cell)
adult_df.head(5)
Exploratory Data Analysis
Understand the dataset and its variables
# Size of the dataset: (n_rows, n_columns)
adult_df.shape
# Summarize the variables (include='all' adds categorical columns to the summary)
adult_df.describe(include='all')
# Find missing values in the dataset
# (likely all zeros here: missing entries are encoded as '?', not NaN — see next cell)
adult_df.isnull().sum()
Inspecting the CSV file shows that missing values are marked with '?'. Locate the columns that contain them.
# Which columns contain the '?' missing-value placeholder?
adult_df.isin(['?']).any()
# Change missing numeric values (age) to the column mean.
# Use .loc instead of chained indexing: `adult_df.age[mask] = ...` raises
# SettingWithCopyWarning and, under pandas' copy-on-write, modifies only a
# temporary copy (a silent no-op).
mean_age = int(adult_df.loc[adult_df['age'] != '?', 'age'].astype(int).mean())
adult_df.loc[adult_df['age'] == '?', 'age'] = mean_age
#adult_df.head(20)
# Replace the remaining '?' markers in the categorical columns with NaN,
# then drop those rows. (np.nan — the np.NaN alias was removed in NumPy 2.0.)
adult_dfNew = adult_df.replace('?', np.nan)
adult_dfNew = adult_dfNew.dropna()
adult_dfNew.shape
adult_dfNew.dtypes
# age was read as object dtype because of the '?' entries; cast it back to int
adult_dfNew.age = adult_dfNew.age.apply(int)
adult_dfNew.describe(include='all')
# Histogram showing the distribution of age
adult_dfNew['age'].hist(grid = False)
# Histogram showing the distribution of education
# NOTE(review): .hist() requires a numeric column — presumably 'education' in
# adult-modified.csv is numeric (e.g. years of education); confirm, otherwise this raises.
adult_dfNew['education'].hist(grid = False)
# Histogram showing the distribution of hours-per-week
adult_dfNew['hours-per-week'].hist(grid = False)
# 2x2 grid of bar charts for the main categorical variables.
# Each chart is drawn explicitly into its own axes via ax= rather than relying
# on pyplot's implicit "current axes" state.
fig = plt.figure(figsize=(10, 10))
fig.subplots_adjust(hspace=.5)
ax1 = fig.add_subplot(221)
ax1.set_xlabel('Workclass')
ax1.set_ylabel('Count')
ax1.set_title("Workclass Distribution")
adult_dfNew['workclass'].value_counts().plot(kind='bar', grid=False, ax=ax1)
ax2 = fig.add_subplot(222)
ax2.set_xlabel('Marital Status')
ax2.set_ylabel('Count')
ax2.set_title("Marital Status Distribution")  # fixed typo: "Martial" -> "Marital"
adult_dfNew['marital-status'].value_counts().plot(kind='bar', grid=False, ax=ax2)
ax3 = fig.add_subplot(223)
ax3.set_xlabel('Race')
ax3.set_ylabel('Count')
ax3.set_title("Race Distribution")
adult_dfNew['race'].value_counts().plot(kind='bar', grid=False, ax=ax3)
ax4 = fig.add_subplot(224)
ax4.set_xlabel('Sex')
ax4.set_ylabel('Count')
ax4.set_title("Sex Distribution")
adult_dfNew['sex'].value_counts().plot(kind='bar', grid=False, ax=ax4)
# Bar chart of the income class distribution.
# NOTE(review): only subplot 221 of a 2x2 grid is used here; the pandas .plot
# call below draws into the current (last-added) axes, i.e. ax1.
fig2 = plt.figure(figsize=(10,10))
fig2.subplots_adjust(hspace=.5)
ax1 = fig2.add_subplot(221)
ax1.set_xlabel('Income')
ax1.set_ylabel('Count')
ax1.set_title("Income Distribution")
adult_dfNew['income'].value_counts().plot(kind='bar', grid = False)
def generate_crosstab(first, second, data=None):
    """Plot a grouped bar chart of the cross-tabulation of two columns.

    Parameters
    ----------
    first, second : str
        Column names to cross-tabulate (``first`` on the x-axis,
        ``second`` as the bar groups).
    data : pandas.DataFrame, optional
        Source data; defaults to the module-level ``adult_dfNew``.
    """
    if data is None:
        data = adult_dfNew
    cross_tab = pd.crosstab(data[first], data[second])
    # plt.show() takes no positional data argument (the original passed the
    # Axes returned by .plot() into it); draw first, then show the figure.
    cross_tab.plot(kind="bar")
    plt.show()
# Cross-tab bar charts pairing demographic attributes with each other and with income
generate_crosstab('education', 'race')
generate_crosstab('workclass', 'income')
generate_crosstab('workclass', 'race')
generate_crosstab('race', 'income')
# Table of the fraction of each race category falling in each income group.
# normalize='index' divides every row by its row total, which is exactly
# equivalent to .apply(lambda x: x/x.sum(), axis=1) on the raw counts.
pd.crosstab(adult_dfNew.race, adult_dfNew.income, normalize='index')
For every race, most of the records in this data set fall in the low-income category. Breaking it down by race: about 90% of Amer-Indian records are low-income and roughly 10% high-income; 77% of Asian records are low-income and 23% high-income; for Black records, 87% are low-income and 13% high-income; for Hispanic records, 92% have income at or below 50K (low-income) and only 8% are high-income; for White records, 73% are low-income and 27% high-income. Among all races, Hispanics have the highest percentage in the low-income category, while Whites have the highest percentage in the high-income category.
# Workclass vs income (row-normalized proportions).
# The original repeated this exact cell twice; the duplicate is removed.
pd.crosstab(adult_dfNew.workclass, adult_dfNew.income).apply(lambda x: x/x.sum(), axis = 1)
# Marital status vs income
pd.crosstab(adult_dfNew['marital-status'], adult_dfNew.income).apply(lambda x: x/x.sum(), axis = 1)
# Sex vs income
pd.crosstab(adult_dfNew.sex, adult_dfNew.income).apply(lambda x: x/x.sum(), axis = 1)
From the analysis above, we see that members of the private workclass are most likely to make less than 50K, whereas those who are self-employed are more likely to make over 50K. Even more notably, single people are significantly more likely to make less than 50K (93.33%) than married people (57.63%). Finally, 30% of males make more than 50K, while only 11.89% of females do.
# Create dummy variables for the categorical attributes, then separate the
# target attribute ("income_>50K") from the attributes used for training.
adult_dfNew = pd.get_dummies(adult_dfNew)
adult_dfNew.head(5)
# Drop the redundant complement column income_<=50K (it is exactly the inverse
# of income_>50K). Use the columns= keyword: positional axis arguments to
# DataFrame.drop were deprecated and removed in pandas 2.0.
adult_dfNew = adult_dfNew.drop(columns='income_<=50K')
adult_dfNew.head(5)
# Separate the target attribute ("income_>50K") from the training attributes.
y = adult_dfNew['income_>50K']
y.head()
# Remove the target from the feature matrix
adult_dfNew = adult_dfNew.drop(columns='income_>50K')
adult_dfNew.head()
# (Gaussian) naive Bayes classifier, evaluated with 10-fold cross-validation.
# sklearn.cross_validation was removed in scikit-learn 0.20; use
# sklearn.model_selection.cross_val_score instead.
from sklearn.model_selection import cross_val_score
nbclf = naive_bayes.GaussianNB()
nbclf = nbclf.fit(adult_dfNew, y)
cv_scores = cross_val_score(nbclf, adult_dfNew, y, cv=10)
cv_scores
print("Overall Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
# Decision tree classifier (entropy split criterion, fixed seed for
# reproducibility), evaluated with 10-fold cross-validation.
# sklearn.cross_validation was removed in scikit-learn 0.20; use
# sklearn.model_selection.cross_val_score instead.
from sklearn.model_selection import cross_val_score
treeclf = tree.DecisionTreeClassifier(criterion='entropy', random_state=9)
treeclf = treeclf.fit(adult_dfNew, y)
cv_scores = cross_val_score(treeclf, adult_dfNew, y, cv=10)
cv_scores
print("Overall Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
# Linear Discriminant Analysis (LDA), evaluated with 10-fold cross-validation.
# sklearn.cross_validation was removed in scikit-learn 0.20; use
# sklearn.model_selection.cross_val_score instead.
from sklearn.model_selection import cross_val_score
ldclf = LinearDiscriminantAnalysis()
ldclf = ldclf.fit(adult_dfNew, y)
cv_scores = cross_val_score(ldclf, adult_dfNew, y, cv=10)
cv_scores
print("Overall Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
# Export the fitted decision tree to Graphviz .dot format and render it.
from sklearn.tree import export_graphviz
export_graphviz(treeclf, out_file='tree.dot', feature_names=adult_dfNew.columns)
import graphviz
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)
# The original line was a bare `system(dot -Tpng ...)`, which is a Python
# syntax error. Shell out via os.system instead (requires the Graphviz
# `dot` binary on PATH).
os.system('dot -Tpng tree.dot -o dtree.png')
from IPython.display import Image
Image(filename='dtree.png', width=800)